# Data Cleaning
# Build the analysis table from the raw file: keep only the columns used by
# this script and give the survey-coded ones short names. Columns listed as a
# bare symbol keep their existing name.
ecls <- ecls_raw[, .(
  childid,
  wave,

  # Analysis clock coded 0/1/2; any wave other than 2, 4 or 9 gets NA
  time = fcase(
    wave == 2, 0,  # Spring Kindergarten (baseline)
    wave == 4, 1,  # Spring 1st Grade
    wave == 9, 2,  # Spring 5th Grade
    default = NA_real_
  ),

  # Achievement outcomes
  math    = math_score,
  science = science_score,

  # Food security measures
  fs_raw,
  fs_scale,
  fs_status,

  # Time-invariant demographics
  sex  = x_chsex_r,
  race = X_RACETHP_R,

  # Baseline socioeconomic status
  ses_baseline = x12sesl,

  # Parent education
  parent1_ed,
  parent2_ed,

  # School characteristics
  school_type,
  urbanicity = locale,

  # Additional controls
  household_size,
  disability
)]
# Clean missing codes
# Negative values in these columns are treated as survey reserve codes rather
# than data. Statement order matters: the -6 recode for fs_scale must run
# before the blanket "negative => NA" rule that follows it.

# fs_scale == -6 marks a food-secure household (no items affirmed), not a
# missing response, so it is assigned 1.4, which this script uses as the
# lowest valid scale value (most conservative choice).
ecls[fs_scale == -6, fs_scale := 1.4]

# Every negative fs_scale still present is a reserve code. Testing `< 0`
# (instead of `< -6`) also blanks codes -1 through -5, which would otherwise
# survive as apparently valid scores and distort means and change scores.
ecls[fs_scale < 0, fs_scale := NA_real_]

# Same rule for the raw score and the two outcomes. Rows that are already NA
# need no special handling: an NA comparison selects no rows in `i`.
ecls[fs_raw < 0, fs_raw := NA_real_]
ecls[math < 0, math := NA_real_]
ecls[science < 0, science := NA_real_]
# Food security exposure variables
# The "first available value" fallback and the running count below both
# depend on the order of rows within a child. No sort is visible before this
# point, so put each child's rows in chronological order first.
setorder(ecls, childid, wave)

# Baseline food security variable (wave 2 value for each child)
ecls[, fs_baseline := fs_scale[wave == 2][1], by = childid]
ecls[, fs_status_baseline := fs_status[wave == 2][1], by = childid]

# If the wave 2 value is absent or missing, fall back to the child's earliest
# non-missing value (still NA when the child has none at all)
ecls[is.na(fs_baseline),
     fs_baseline := fs_scale[!is.na(fs_scale)][1],
     by = childid]
ecls[is.na(fs_status_baseline),
     fs_status_baseline := fs_status[!is.na(fs_status)][1],
     by = childid]

# Food security change variable (current - baseline)
ecls[, fs_change := fs_scale - fs_baseline]

# Insecurity indicator: 1 for low/very low (status 2 or 3), 0 for status 1,
# and NA when status is missing or outside 1-3. `%in%` never returns NA, so
# without the explicit guard a missing status would be counted as food secure.
ecls[, fs_insecure := fifelse(
  fs_status %in% c(1, 2, 3),
  as.numeric(fs_status %in% c(2, 3)),
  NA_real_
)]

# Cumulative exposure: running count of insecure waves up to and including the
# current one. Waves with unknown status add nothing to the count.
ecls[, fs_cumulative := cumsum(replace(fs_insecure, is.na(fs_insecure), 0)),
     by = childid]
# SES quartiles for the moderation analysis: cut baseline SES at its own
# 0/25/50/75/100th percentiles, computed over all rows with non-missing SES.
ecls[, ses_quartile := {
  quartile_cuts <- quantile(
    ses_baseline,
    probs = seq(0, 1, by = 0.25),
    na.rm = TRUE
  )
  cut(
    ses_baseline,
    breaks = quartile_cuts,
    labels = c("Q1_Lowest", "Q2", "Q3", "Q4_Highest"),
    include.lowest = TRUE  # keep the minimum SES value inside Q1
  )
}]
# Convert categorical variables to factors
# The raw values carry string labels after a colon; everything from the first
# colon onward is stripped so only the leading code remains, the codes are
# mapped to labels, and any other value becomes NA in the factor.

# Sex: code "1" -> Male, "2" -> Female
ecls[, sex := {
  sex_code <- gsub(":.*", "", sex)
  sex_code[sex_code %in% "1"] <- "Male"
  sex_code[sex_code %in% "2"] <- "Female"
  factor(sex_code, levels = c("Male", "Female"))
}]

# Food security status: codes 1-3 get descriptive labels
ecls[, fs_status_factor := factor(
  fs_status,
  levels = seq_len(3),
  labels = c("High/Marginal", "Low", "Very Low")
)]

# Disability: code "1" -> Yes, "2" -> No
ecls[, disability := {
  disability_code <- gsub(":.*", "", disability)
  disability_code[disability_code %in% "1"] <- "Yes"
  disability_code[disability_code %in% "2"] <- "No"
  factor(disability_code, levels = c("Yes", "No"))
}]
# Order rows by child, then by time within child
setorderv(ecls, c("childid", "time"))

# Completion banner followed by overall row and child counts
cat("\n", strrep("=", 80), "\n", sep = "")
##
## ================================================================================
cat("DATA PREPARATION COMPLETE\n")
## DATA PREPARATION COMPLETE
cat(strrep("=", 80), "\n", sep = "")
## ================================================================================
cat("Observations after cleaning:", ecls[, .N], "\n")
## Observations after cleaning: 54522
cat("Children with at least one observation:", uniqueN(ecls[["childid"]]), "\n")
## Children with at least one observation: 18174
# Per-wave sample sizes: distinct children, non-missing counts for each
# measure, and the percentage of children with both math and fs_scale observed.
sample_sizes <- ecls[, {
  n_kids <- uniqueN(childid)
  math_seen <- !is.na(math)
  fs_seen <- !is.na(fs_scale)
  .(
    n_children     = n_kids,
    n_with_math    = sum(math_seen),
    n_with_science = sum(!is.na(science)),
    n_with_fs      = sum(fs_seen),
    pct_complete   = round(100 * sum(math_seen & fs_seen) / n_kids, 1)
  )
}, by = wave]
print(kable(sample_sizes, caption = "Sample Sizes by Wave", digits = 1))
##
##
## Table: Sample Sizes by Wave
##
## | wave| n_children| n_with_math| n_with_science| n_with_fs| pct_complete|
## |----:|----------:|-----------:|--------------:|---------:|------------:|
## | 2| 18174| 17143| 16936| 12910| 68.7|
## | 4| 18174| 15103| 15072| 12313| 65.2|
## | 9| 18174| 11426| 11419| 9308| 46.9|
# Per-wave mean and standard deviation of each outcome and of the food
# security scale, ignoring missing values and rounded to two decimals.
outcome_means <- ecls[, {
  avg_2dp <- function(v) round(mean(v, na.rm = TRUE), 2)
  sd_2dp <- function(v) round(sd(v, na.rm = TRUE), 2)
  .(
    Math_Mean     = avg_2dp(math),
    Math_SD       = sd_2dp(math),
    Science_Mean  = avg_2dp(science),
    Science_SD    = sd_2dp(science),
    FS_Scale_Mean = avg_2dp(fs_scale),
    FS_Scale_SD   = sd_2dp(fs_scale)
  )
}, by = wave]
cat("\n")
print(kable(outcome_means, caption = "Outcome Variables by Wave", digits = 2))
##
##
## Table: Outcome Variables by Wave
##
## | wave| Math_Mean| Math_SD| Science_Mean| Science_SD| FS_Scale_Mean| FS_Scale_SD|
## |----:|---------:|-------:|------------:|----------:|-------------:|-----------:|
## | 2| 49.86| 13.34| 33.48| 7.38| 1.93| 1.40|
## | 4| 72.25| 15.73| 42.36| 10.36| 1.88| 1.36|
## | 9| 119.66| 17.79| 73.17| 13.04| 1.74| 1.18|
# Food security status prevalence: count rows per wave and status category
# (rows with no valid status are excluded), then express each count as a
# percentage of that wave's total.
fs_prevalence <- ecls[
  !is.na(fs_status_factor),
  .N,
  by = .(wave, fs_status_factor)
]
fs_prevalence[, Percentage := round(100 * N / sum(N), 1), by = wave]
cat("\n")
print(kable(fs_prevalence, caption = "Food Security Status Distribution", digits = 1))
##
##
## Table: Food Security Status Distribution
##
## | wave|fs_status_factor | N| Percentage|
## |----:|:----------------|-----:|----------:|
## | 2|High/Marginal | 11292| 87.5|
## | 4|High/Marginal | 10911| 88.6|
## | 9|High/Marginal | 8583| 92.2|
## | 9|Low | 526| 5.7|
## | 2|Very Low | 369| 2.9|
## | 4|Low | 1059| 8.6|
## | 2|Low | 1249| 9.7|
## | 4|Very Low | 343| 2.8|
## | 9|Very Low | 199| 2.1|
# Baseline characteristics: frequency tables for the wave 2 rows, with
# missing values shown as their own category whenever any are present.
cat("\n\nBASELINE CHARACTERISTICS (Wave 2):\n")
##
##
## BASELINE CHARACTERISTICS (Wave 2):
baseline <- ecls[wave == 2]
cat("Sex:\n")
## Sex:
print(table(baseline[["sex"]], useNA = "ifany"))
##
## Male Female <NA>
## 9288 8847 39
cat("\nSES Quartiles:\n")
##
## SES Quartiles:
print(table(baseline[["ses_quartile"]], useNA = "ifany"))
##
## Q1_Lowest Q2 Q3 Q4_Highest <NA>
## 4012 3995 4015 3983 2169
cat("\nDisability:\n")
##
## Disability:
print(table(baseline[["disability"]], useNA = "ifany"))
##
## Yes No <NA>
## 2566 10473 5135